Since 2008, guests and hosts have used Airbnb to travel in a more unique, personalized way. However, the scope of this case analysis is limited to December 2020 through December 2021. As part of the Inside Airbnb initiative, this dataset describes the listing activity of homestays in Hawaii County, Hawaii.
Data Source: http://insideairbnb.com/get-the-data.html
import requests
import os
import zipfile as zp
import gzip
import pandas as pd
import csv
import shutil
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
# Clone the repository that hosts the pre-split dataset files (Colab shell command)
!git clone https://github.com/eliasezar27/PRESANA.git
# Paths to the calendar (split into four parts), listings and covid CSV files
base_dir = '/content/PRESANA/hawaii_airBnbGroup'
calendar_csv1 = os.path.join(base_dir, 'calendar_main_part1.csv')
calendar_csv2 = os.path.join(base_dir, 'calendar_main_part2.csv')
calendar_csv3 = os.path.join(base_dir, 'calendar_main_part3.csv')
calendar_csv4 = os.path.join(base_dir, 'calendar_main_part4.csv')
listings_csv = os.path.join(base_dir, 'listings_main.csv')
covid_csv = os.path.join(base_dir, 'hawaii_covid_cases.csv')
# Load the listings table and show its schema
listings = pd.read_csv(listings_csv)
listings.info()
# Load the four calendar parts (parsing 'date' as datetime) and recombine them
calendar_df1 = pd.read_csv(calendar_csv1, parse_dates=['date'])
calendar_df2 = pd.read_csv(calendar_csv2, parse_dates=['date'])
calendar_df3 = pd.read_csv(calendar_csv3, parse_dates=['date'])
calendar_df4 = pd.read_csv(calendar_csv4, parse_dates=['date'])
calendar = pd.concat([calendar_df1, calendar_df2, calendar_df3, calendar_df4])
print('Calendar dataframe Information:\n')
calendar.info()
# Load daily covid cases and add a running total of the 'total' column
# NOTE(review): this 'cumsum' is later rebuilt from the 'hawaii' column - confirm which is intended
covid_cases = pd.read_csv(covid_csv, parse_dates=['date'])
covid_cases['cumsum'] = covid_cases['total'].cumsum()
covid_cases.info()
# Inspect null values per column in the listings dataframe
print('Look for null values in the dataframe per column\n')
null_counts = listings.isnull().sum()  # compute once instead of four times
col_nans = null_counts[null_counts > 0].index.values  # column names containing NaNs
null_counts[null_counts > 0]
# Listings priced at $0.00 and their host name
listings[listings['price'] == 0][['id', 'host_name', 'price']]
# View other listings sharing a host name with the $0.00-priced listings
print('Total listings with host name: Alohilani Resort ', listings[listings['host_name'] == 'Alohilani Resort']['host_name'].count())
print('Total listings with host name: Hilo Hawaiian ', listings[listings['host_name'] == 'Hilo Hawaiian']['host_name'].count())
print('Total listings with host name: Waipouli Beach Resort By Outrigger ', listings[listings['host_name'] == 'Waipouli Beach Resort By Outrigger']['host_name'].count())
# Fill $0.00-priced listings with the median price of listings belonging to the same host
mask1 = listings['id'] == 41740619
mask2 = listings['id'] == 43309266
listings['price'] = listings['price'].mask(mask1, float(listings[listings['host_name'] == 'Alohilani Resort']['price'].median()))
listings['price'] = listings['price'].mask(mask2, float(listings[listings['host_name'] == 'Hilo Hawaiian']['price'].median()))
# Drop the $0.00-priced listing with no similarly named host to impute from
# (boolean mask is robust even if the id is absent, unlike index[...][0])
listings = listings[listings['id'] != 47816777]
# Drop columns containing any NaN values
listings.drop(columns=col_nans, inplace=True)
# Positional axis argument was removed in pandas 2.0; use the keyword form
listings.dropna(axis=1, inplace=True)
# Keep only listings whose cleansed neighbourhood group is Hawaii (County)
listings.query('neighbourhood_group_cleansed == "Hawaii"', inplace = True)
listings.describe().transpose()
listings.reset_index(drop=True, inplace=True)
# Inspect null values per column in the calendar dataframe
print('Look for null values in the dataframe per column\n')
calendar.isnull().sum()
# Fill nulls in adjusted_price with the listing's price from the listings dataframe.
# Build an id -> price lookup once instead of scanning listings inside the loop;
# membership test replaces the original bare `except: pass`.
price_by_id = listings.drop_duplicates('id').set_index('id')['price']
for i in calendar[calendar['adjusted_price'].isnull()]['listing_id'].unique():
    if i in price_by_id.index:
        mask = calendar['listing_id'] == i
        calendar.loc[mask, 'adjusted_price'] = calendar.loc[mask, 'adjusted_price'].fillna(price_by_id[i])
# Transform categorical available column into numeric where 'f' is 1 and 't' is 0
# (i.e. occupied == 1 means the listing was NOT available that day)
calendar['occupied'] = calendar['available'].map({'f': 1, 't': 0})
calendar.rename(columns={'listing_id':'id'}, inplace=True)
# Attach listing metadata (name, location, room type, county) to every calendar row
calendar = pd.merge(calendar,
                    listings[['id', 'name', 'neighbourhood_group_cleansed', 'longitude', 'latitude', 'room_type']],
                    on='id', how='left')
calendar.rename(columns={'neighbourhood_group_cleansed':'county'}, inplace=True)
calendar.rename(columns={'adjusted_price':'price'}, inplace=True)
# Sort calendar dataframe based on date from oldest to latest
calendar = calendar.sort_values(['date', 'id'])
# View the oldest and latest listing history
print("Historical Listings are from:", pd.to_datetime(calendar.date).min(), 'to', pd.to_datetime(calendar.date).max())
calendar.describe().transpose()
# Keep only Hawaii County rows
calendar.query('county == "Hawaii"', inplace = True)
calendar.reset_index(drop=True, inplace=True)
# Compute AirBnb's profit per occupied listing (3% guest fee + 14% host fee)
calendar_profit = calendar.query('occupied == 1').assign(profit = lambda x: (x['price'] * 0.03) + (x['price'] * 0.14)).reset_index(drop=True)
# Dataframe with aggregated occupied-listing tally, mean price and summed profit per date
calendar_day = calendar.groupby('date', as_index=False)['occupied'].sum()
calendar_day = pd.merge(calendar_day,
                        calendar_profit.groupby('date', as_index=False)['price'].mean(),
                        on='date', how='left')
calendar_day = pd.merge(calendar_day,
                        calendar_profit.groupby('date', as_index=False)['profit'].sum(),
                        on='date', how='left')
# Dataframe with aggregated occupied-listing tally per listing id
calendar_id = calendar.query('occupied == 1').groupby(['id'], as_index=False)['occupied'].sum()
calendar_id = pd.merge(calendar_id,
                       listings[['id', 'name', 'longitude', 'latitude', 'price', 'room_type']],
                       on='id', how='left').drop_duplicates('id').reset_index(drop=True)
calendar_id = calendar_id.assign(profit = lambda x: ((x['price'] * 0.03) + (x['price'] * 0.14)) * x['occupied'] ).reset_index(drop=True)

def _daily_mean_profit(room_type):
    """Return the daily mean profit for one room type as a date-indexed series with daily frequency."""
    daily = calendar_profit[calendar_profit['room_type'] == room_type].groupby('date', as_index=False)['profit'].mean()
    daily.index = pd.to_datetime(daily['date'])
    daily.drop(columns=['date'], inplace=True)
    return daily.asfreq('d')

# Per-room-type daily mean profit time series (replaces four copy-pasted blocks)
apartment_rm = _daily_mean_profit('Entire home/apt')
private_rm = _daily_mean_profit('Private room')
shared_rm = _daily_mean_profit('Shared room')
hotel_rm = _daily_mean_profit('Hotel room')
What are the locations of the properties listed on Airbnb across the state of Hawaii?
# Interactive map of every occupied listing, colored by room type;
# hover shows price, occupancy tally and computed profit
fig = px.scatter_mapbox(calendar_id, lat="latitude", lon="longitude",
hover_name="name", hover_data=["price", "occupied", "profit"],
color="room_type", zoom=7.9, height=600)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
# Pin the legend to the top-left corner of the map
fig.update_layout(legend=dict(
yanchor="top",
y=0.99,
xanchor="left",
x=0.01
))
fig.show()
What Room Type has the most AirBNB listings?
# Count listings per room type and plot in descending order
fig = px.bar(calendar_id.groupby(['room_type'], as_index=False).count().sort_values(['id'], ascending=False),
x="room_type", y="id", labels={'room_type': 'Room Type', 'id':'Count'},
title='Number of Listings per Room Type', color = 'room_type')
fig.show()
How frequently was a specific listing occupied, per room type?
What is the average listing occupancy per room type?
# Average days occupied per room type, with mean profit shown on hover
# NOTE(review): groupby(...).mean() here averages all columns; pandas >= 2.0
# raises on non-numeric columns (e.g. 'name') unless numeric_only=True - confirm pandas version
fig = px.bar(calendar_id.groupby(['room_type'], as_index=False).mean().sort_values(['occupied'], ascending=False),
x="room_type", y="occupied", labels={'room_type': 'Room Type', 'occupied':'Count'}, hover_data = ['profit'],
title='Average Listing Occupancy per Room Type', color = 'room_type')
fig.show()
What listings are affordable?
# Ten cheapest listings: tail of a descending sort by price, as horizontal bars
# (title typo 'Afordable' fixed)
fig = px.bar(calendar_id[['name', 'price', 'room_type']].sort_values(['price'], ascending = False).tail(10),
             x="price", y="name", orientation='h', hover_name='room_type', color = 'room_type',
             title='Top 10 Affordable Listings')
fig.show()
What is the distribution of the listing prices?
# Distribution of listing prices.
# The labels key must match the plotted column ('price'); the original keyed
# on 'profit', so the axis label was never applied.
fig = px.histogram(calendar_id, x="price",
                   labels={'price': 'Price ($)'},
                   title='Distribution of Listing Price in Hawaii State')
fig.show()
What are the price ranges for every room type in the listings?
# Box plot of listing prices per room type
fig = px.box(calendar_id, x="room_type", y="price",
labels={'price': 'Price ($)', 'room_type': 'Room Type'},
title='Listing Prices per Room type', color='room_type')
# Pin the legend to the top-right corner
fig.update_layout(legend=dict(
yanchor="top",
y=0.99,
xanchor="right",
x=.99
))
fig.show()
What is the behavior of the listing profits per day?
# Daily total profit across all occupied listings
fig = px.line(calendar_day, x="date", y="profit",
title='Sum Profits per Day',
labels={'profit': 'Sum Profits ($)'})
fig.show()
What is the renting behavior of all the listings per day?
# Daily count of occupied listings over time
fig = px.line(calendar_day, x="date", y="occupied",
title='Daily Occupied Listings',
labels={'occupied': 'Total Listings Occupied'})
fig.update_layout(legend=dict(
yanchor="top",
y=0.99,
xanchor="left",
x=.01
))
fig.show()
What is the behavior of the Listings during COVID-19 pandemic?
# Daily confirmed covid cases (Hawaii column)
plt.figure(figsize=(20,2))
plt.title('Confirmed Covid Cases per Day', fontdict={'fontsize': 20})
plt.bar(covid_cases['date'], covid_cases['hawaii'])
plt.show()
# Mean listing price per day
plt.figure(figsize=(20,2))
plt.title('Mean Price of All Listings per Day', fontdict={'fontsize': 20})
calendar_day.groupby('date')['price'].mean().plot()
plt.xlabel(None)
plt.show()
# Total occupied listings per day
plt.figure(figsize=(20,2))
plt.title('Daily Number Occupied Listings', fontdict={'fontsize': 20})
calendar.groupby('date')['occupied'].sum().plot()
plt.xlabel(None)
plt.show()
# Mean occupancy rate per day
plt.figure(figsize=(20,2))
plt.title('Daily Average Occupied Listings', fontdict={'fontsize': 20})
calendar.groupby('date')['occupied'].mean().plot()
plt.xlabel(None)
plt.show()
# Number of calendar rows (listings tracked) per day
plt.figure(figsize=(20,2))
plt.title('Listing Counts over Time', fontdict={'fontsize': 20})
calendar.groupby('date')['occupied'].count().plot()
plt.show()
# Rebuild the cumulative sum from the 'hawaii' column and restrict to the calendar's date range
# NOTE(review): this overwrites the earlier 'cumsum' computed from 'total' - confirm which column is intended
covid_cases['cumsum'] = covid_cases['hawaii'].cumsum()
covid_cases2 = covid_cases[covid_cases['date'] >= calendar['date'].min()][['date', 'hawaii', 'cumsum']]
# Overlay cumulative covid cases and daily occupied listings
plt.figure(figsize=(20,5))
plt.title('Total Confirmed Covid Cases per day and Number of Daily Occupied Listings', fontdict={'fontsize': 20})
plt.plot(covid_cases2['date'], covid_cases2['cumsum'], label='Cummulative summary of cases', color='orange')
plt.plot(calendar_day.groupby('date')['occupied'].sum(), label='Listings occupied')
plt.legend()
plt.show()
What is the Correlation of Daily Number of Occupied Listings and the Median Price of the Listings per Day?
# Scatter of daily occupancy vs mean price, then the Pearson correlation matrix
fig = px.scatter(calendar_day,
x="occupied", y="price",
title='Daily Number of Occupied Listings vs Mean Price of Listings per Day',
labels={'price': 'Mean Prices ($)', 'occupied': 'Total Listings Occupied'})
fig.show()
calendar_day[['occupied', 'price']].corr("pearson")
Exactly –1. A perfect downhill (negative) linear relationship
–0.70. A strong downhill (negative) linear relationship
–0.50. A moderate downhill (negative) relationship
–0.30. A weak downhill (negative) linear relationship
+0.30. A weak uphill (positive) linear relationship
+0.50. A moderate uphill (positive) relationship
+0.70. A strong uphill (positive) linear relationship
Exactly +1. A perfect uphill (positive) linear relationship
from sklearn.linear_model import LinearRegression
# Simple linear regression: mean daily price as a function of daily occupancy
lr = LinearRegression()
y = calendar_day[['price']]
x = calendar_day[['occupied']]
lr_model = lr.fit(x, y)
r_sq = lr_model.score(x, y)
print('coefficient of determination (R-squared):', r_sq)
# Report the computed R-squared instead of a hard-coded "10.11%"
print('Meaning - Only {:.2%} of the model fits the observed data'.format(r_sq))
# Same scatter with an OLS trend line overlaid
fig = px.scatter(calendar_day,
                 x="occupied", y="price",
                 title='Daily Number of Occupied Listings vs Mean Price of Listings per Day',
                 trendline='ols', trendline_color_override='red',
                 labels={'price': 'Mean Prices ($)', 'occupied': 'Total Listings Occupied'})
fig.show()
It implies that the higher the number of occupied listings, the lower the mean of the listing price will be.
# Reindex the daily aggregates on date with an explicit daily frequency (needed for time-series modeling)
calendar_day.index = calendar_day['date']
calendar_day = calendar_day.asfreq('d')
from statsmodels.tsa.stattools import adfuller
def adfTest(data):
    """Run an Augmented Dickey-Fuller test on *data* and print the results.

    H0: the series is non-stationary; Ha: the series is stationary.
    H0 is rejected at a given level when the test statistic is below that
    level's critical value.
    """
    print('H0: Data is non-stationary')
    print('Ha: Data is stationary \n')
    dec = {1: 'fail to reject', 2: 'reject', 3: 'Non-stationary', 4: 'Stationary'}
    result = adfuller(data)
    print('ADF Statistic/ Test Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('No. of lags: %d' % result[2])
    print('Num of obs: %d' % result[3])
    print('Critical Values:')
    con = dec[1]  # default so the summary line is defined even if no critical values are returned
    for key, value in result[4].items():
        con = dec[2] if result[0] < value else dec[1]
        print('\t%s: %.3f, therefore,' % (key, value), con, 'null hypothesis')
    # NOTE(review): the summary claims the 5% level but uses the verdict from
    # the last dict entry iterated - confirm intent
    print('\nFor a 5% critical value, the data is', dec[3] if con == dec[1] else dec[4])
H0: Data is non-stationary
Ha: Data is stationary
if Test Statistic < Critical Values => Rejects the null hypothesis.
if Test Statistic > Critical Values => failed to reject the null hypothesis.
adfTest(calendar_day['profit'])
def globVSloc(data):
    """Compare the global mean of *data* with the mean of a Cochran-sized local sample.

    Prints a two-tailed z-test of H0: global mean == local mean; when H0 is
    not rejected the series mean is considered stable, i.e. the data is
    reported as stationary.
    """
    from math import sqrt
    # Cochran's sample size (95% confidence, p = 0.5, 5% margin), corrected for population size
    n0 = int(((1.96)**2 * (0.5) * (1 - 0.5)) / (0.05)**2)
    n = int(n0 /(1+(( n0 - 1 )/ len(data) )))
    dec_mean = {1: 'Reject H0', 2: 'Fail to reject H0'}
    global_mean = abs(data.mean())
    # NOTE(review): assumes global_sd > 0; a constant series divides by zero here
    global_sd = sqrt(data.var())
    local_mean = abs(data[:n].mean())
    z_range = [-1.63, 1.63]
    z_val = (local_mean - global_mean)/(global_sd/sqrt(n))
    hyp = dec_mean[1] if z_val < z_range[0] or z_val > z_range[1] else dec_mean[2]
    # Use equality, not identity: `is not` on str literals relies on CPython
    # constant interning and emits a SyntaxWarning. Also bind to a fresh name
    # instead of rebinding dec_mean from dict to str.
    verdict = 'Stationary' if hyp != 'Reject H0' else 'Not Stationary'
    print('Global Mean: {:,.2f}'.format(global_mean))
    print('Global Standard Deviation: {:,.2f}'.format(global_sd))
    print('Number of Samples: {:,.2f}'.format(n))
    print('Local Mean: {:,.2f}'.format(local_mean))
    print('\nHypothesis Testing using z\n')
    print('H0: Global Mean == Local Mean')
    print('Ha: Global Mean != Local Mean')
    print('Two-tailed Test')
    print('Confidence Level: 0.95')
    print('z range: -1.63 --- 1.63')
    print('z score is: ', z_val)
    print('Therefore,', hyp)
    print('Data is:', verdict)
globVSloc(calendar_day['profit'])
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.graphics.api import qqplot
def corr(df):
    """Print a lag table of ACF/PACF values with Ljung-Box Q statistics, then plot both correlograms."""
    acf_vals, q_stat, p_vals = sm.tsa.acf(df, fft=True, qstat=True, nlags=36)
    pacf_vals = sm.tsa.pacf(df, nlags=36)
    # Lag 0 is dropped from the ACF/PACF columns to line up with the Q statistics
    table = pd.DataFrame(
        np.c_[range(1, 37), acf_vals[1:], pacf_vals[1:], q_stat, p_vals],
        columns=['lag', "AC", "PAC", "Q", "Prob"],
    )
    print(table.set_index('lag'))
    fig = plt.figure(figsize=(24, 8))
    top = fig.add_subplot(211)
    sm.graphics.tsa.plot_acf(df, lags=36, ax=top, zero=False)
    bottom = fig.add_subplot(212)
    sm.graphics.tsa.plot_pacf(df, lags=36, ax=bottom, zero=False)
    plt.show()
corr(calendar_day['profit'])
def firstlvldiff(df):
    """Add a first-level difference of the 'profit' column to *df*, plot it, and return *df*.

    The first row has no predecessor and is set to 0, matching the original
    loop-based behaviour.
    """
    # Vectorized diff replaces the Python loop and avoids integer chained
    # indexing (df['profit'][i]), which fails on a datetime index in modern pandas.
    df['frst_lvl_dff'] = df['profit'].diff().fillna(0)
    df[['frst_lvl_dff']].plot(figsize=(15, 7))
    plt.show()
    return df
# First-difference the daily profits, then re-run stationarity tests and correlograms
calendar_day = firstlvldiff(calendar_day)
adfTest(calendar_day['frst_lvl_dff'])
globVSloc(calendar_day['frst_lvl_dff'])
corr(calendar_day['frst_lvl_dff'])
import warnings
import itertools
warnings.filterwarnings("ignore")
# Candidate ARIMA orders for the all-listings series: p from the PACF, d = 1, q from the ACF
p = [1,2] # AR = PACF
d = [1] # difference
q = [1] # MA = ACF
pdq = list(itertools.product(p, d, q))
print("PDQ are", pdq)
aic_list = []
bic_list = []
hqic_list = []
pdq_models = []
from statsmodels.tsa.arima_model import ARIMA
for param in pdq:
    try:
        mod = ARIMA(calendar_day['frst_lvl_dff'], order=param)
        results = mod.fit()
        # AIC - Akaike, BIC - Schwarz criterion, HQIC - Hannan-Quinn
        print('ARIMA{}\tAIC:{:,}\t BIC:{:,}\t HQIC:{:,}'.format(param, round(results.aic, 2), round(results.bic, 2), round(results.hqic, 2)))
        pdq_models.append(param)
        aic_list.append(results.aic)
        bic_list.append(results.bic)
        hqic_list.append(results.hqic)
    except Exception:
        # skip orders that fail to converge; `except Exception` (not bare
        # except) so KeyboardInterrupt/SystemExit still propagate
        continue
print('ARIMA model with lowest AIC', pdq_models[aic_list.index(min(aic_list))])
print('ARIMA model with lowest BIC', pdq_models[bic_list.index(min(bic_list))])
print('ARIMA model with lowest HQIC', pdq_models[hqic_list.index(min(hqic_list))])
# Fit ARIMA(2,1,1) on the daily summed profits
mod211 = ARIMA(calendar_day['profit'], order=(2, 1, 1))
arima211 = mod211.fit()
# Make Predictions until 2022 February (levels, i.e. undifferenced scale)
arima211_pred = arima211.predict("2020-12-27", "2022-02-1", typ='levels')
# Get Confidence Intervals of the 59 unobserved forecast steps
arima211_conf_int = pd.DataFrame(arima211.forecast(59)[2], columns=['min', 'max'])
# Align the confidence-interval index with the tail of the prediction index
ind_slice = len(arima211_pred) - len(arima211_conf_int)
arima211_conf_int.index = arima211_pred.index[ind_slice:]
# Check Residuals
def check_resid(arima_model):
    """Print the ACF/PACF lag table of the model residuals and plot both correlograms."""
    resids = arima_model.resid
    acf_vals, q_stat, p_vals = sm.tsa.acf(resids, fft=True, qstat=True, nlags=36)
    pacf_vals = sm.tsa.pacf(resids, nlags=36)
    # Lag 0 is dropped so the columns line up with the Q statistics
    table = pd.DataFrame(
        np.c_[range(1, 37), acf_vals[1:], pacf_vals[1:], q_stat, p_vals],
        columns=['lag', "AC", "PAC", "Q", "Prob"],
    )
    print(table.set_index('lag'))
    fig = plt.figure(figsize=(24, 8))
    top = fig.add_subplot(211)
    sm.graphics.tsa.plot_acf(resids, lags=36, ax=top, zero=False)
    bottom = fig.add_subplot(212)
    sm.graphics.tsa.plot_pacf(resids, lags=36, ax=bottom, zero=False)
# Residual diagnostics for the fitted ARIMA(2,1,1)
check_resid(arima211)
from math import sqrt
from sklearn.metrics import mean_squared_error
# RMSE over the overlap of observed profits and in-sample predictions
rmse = sqrt(mean_squared_error(calendar_day['profit'][8:350], arima211_pred[:342]))
print('Test RMSE: {:,.3f}'.format(rmse))
What will be the trend of the AirBNB profit in Hawaii, Hawaii?
def show_plot(df, pred, conf, tit = "Profits"):
    """Plot the observed profit series against the ARIMA prediction and its confidence band."""
    fig, ax = plt.subplots(figsize=(30, 10))
    plt.title(tit + ': Original Observation vs ARIMA Prediction', {'fontsize': 50})
    # Observed series in solid green, prediction dashed red, confidence band in blue
    plt.plot(df['profit'], color='green', linewidth=3, label='Original Obs')
    plt.plot(pred, color='red', linestyle='dashed', label='Prediction until 1st of February 2022')
    plt.plot(conf['min'], color='blue', label='Prediction\'s Confidence Interval')
    plt.plot(conf['max'], color='blue')
    plt.legend(prop={'size': 20})
    plt.show()
# Observed vs predicted profits for all listings
show_plot(calendar_day, arima211_pred, arima211_conf_int)
# Apartment/entire-home series: plot, stationarity tests, correlograms, then first-difference
plt.figure(figsize=(15,6))
apartment_rm['profit'].plot()
plt.show()
adfTest(apartment_rm['profit'])
globVSloc(apartment_rm['profit'])
corr(apartment_rm['profit'])
apartment_rm = firstlvldiff(apartment_rm)
adfTest(apartment_rm['frst_lvl_dff'])
globVSloc(apartment_rm['frst_lvl_dff'])
corr(apartment_rm['frst_lvl_dff'])
# Grid-search candidate ARIMA orders for the apartment/entire-home series
p = [1,2] # AR = PACF
d = [1] # difference
q = [1] # MA = ACF
pdq = list(itertools.product(p, d, q))
print("PDQ are", pdq)
aic_list = []
bic_list = []
hqic_list = []
pdq_models = []
from statsmodels.tsa.arima_model import ARIMA
for param in pdq:
    try:
        # BUGFIX: this section evaluates the apartment series; the original
        # copy-pasted calendar_day['frst_lvl_dff'] here
        mod = ARIMA(apartment_rm['frst_lvl_dff'], order=param)
        results = mod.fit()
        # AIC - Akaike, BIC - Schwarz criterion, HQIC - Hannan-Quinn
        print('ARIMA{}\tAIC:{:,}\t BIC:{:,}\t HQIC:{:,}'.format(param, round(results.aic, 2), round(results.bic, 2), round(results.hqic, 2)))
        pdq_models.append(param)
        aic_list.append(results.aic)
        bic_list.append(results.bic)
        hqic_list.append(results.hqic)
    except Exception:
        # skip non-converging orders without swallowing KeyboardInterrupt
        continue
print('ARIMA model with lowest AIC', pdq_models[aic_list.index(min(aic_list))])
print('ARIMA model with lowest BIC', pdq_models[bic_list.index(min(bic_list))])
print('ARIMA model with lowest HQIC', pdq_models[hqic_list.index(min(hqic_list))])
# Fit ARIMA(2,1,1) on the apartment/entire-home daily mean profits
mod211 = ARIMA(apartment_rm['profit'], order=(2, 1, 1))
apart_arima211 = mod211.fit()
# Make Predictions until 2022 February (levels scale)
apart_arima211_pred = apart_arima211.predict("2020-12-27", "2022-02-1", typ='levels')
# Get Confidence Intervals of the 59 unobserved forecast steps
apart_arima211_conf_int = pd.DataFrame(apart_arima211.forecast(59)[2], columns=['min', 'max'])
# Align the confidence-interval index with the tail of the prediction index
ind_slice = len(apart_arima211_pred) - len(apart_arima211_conf_int)
apart_arima211_conf_int.index = apart_arima211_pred.index[ind_slice:]
# Residual diagnostics, in-sample RMSE, and observed-vs-predicted plot
check_resid(apart_arima211)
rmse = sqrt(mean_squared_error(apartment_rm['profit'][8:350], apart_arima211_pred[:342]))
print('Test RMSE: {:,.3f}'.format(rmse))
show_plot(apartment_rm, apart_arima211_pred, apart_arima211_conf_int, "Profits from Apartment Listings")
# Private-room series: plot, stationarity tests, correlograms, then first-difference
plt.figure(figsize=(15,6))
private_rm['profit'].plot()
plt.show()
adfTest(private_rm['profit'])
globVSloc(private_rm['profit'])
corr(private_rm['profit'])
private_rm = firstlvldiff(private_rm)
adfTest(private_rm['frst_lvl_dff'])
globVSloc(private_rm['frst_lvl_dff'])
corr(private_rm['frst_lvl_dff'])
# Grid-search candidate ARIMA orders for the private-room series
p = [1,2,5] # AR = PACF
d = [1] # difference
q = [1,2,5] # MA = ACF
pdq = list(itertools.product(p, d, q))
print("PDQ are", pdq)
aic_list = []
bic_list = []
hqic_list = []
pdq_models = []
from statsmodels.tsa.arima_model import ARIMA
for param in pdq:
    try:
        # BUGFIX: this section evaluates the private-room series; the original
        # copy-pasted calendar_day['frst_lvl_dff'] here
        mod = ARIMA(private_rm['frst_lvl_dff'], order=param)
        results = mod.fit()
        # AIC - Akaike, BIC - Schwarz criterion, HQIC - Hannan-Quinn
        print('ARIMA{}\tAIC:{:,}\t BIC:{:,}\t HQIC:{:,}'.format(param, round(results.aic, 2), round(results.bic, 2), round(results.hqic, 2)))
        pdq_models.append(param)
        aic_list.append(results.aic)
        bic_list.append(results.bic)
        hqic_list.append(results.hqic)
    except Exception:
        # skip non-converging orders without swallowing KeyboardInterrupt
        continue
print('ARIMA model with lowest AIC', pdq_models[aic_list.index(min(aic_list))])
print('ARIMA model with lowest BIC', pdq_models[bic_list.index(min(bic_list))])
print('ARIMA model with lowest HQIC', pdq_models[hqic_list.index(min(hqic_list))])
# Fit ARIMA(5,1,2) on the private-room daily mean profits
mod512 = ARIMA(private_rm['profit'], order=(5, 1, 2))
priv_arima512 = mod512.fit()
# Make Predictions until 2022 February (levels scale)
priv_arima512_pred = priv_arima512.predict("2020-12-27", "2022-02-1", typ='levels')
# Get Confidence Intervals of the 59 unobserved forecast steps
priv_arima512_conf_int = pd.DataFrame(priv_arima512.forecast(59)[2], columns=['min', 'max'])
# Align the confidence-interval index with the tail of the prediction index
ind_slice = len(priv_arima512_pred) - len(priv_arima512_conf_int)
priv_arima512_conf_int.index = priv_arima512_pred.index[ind_slice:]
# Residual diagnostics, in-sample RMSE, and observed-vs-predicted plot
check_resid(priv_arima512)
rmse = sqrt(mean_squared_error(private_rm['profit'][8:350], priv_arima512_pred[:342]))
print('Test RMSE: {:,.3f}'.format(rmse))
show_plot(private_rm, priv_arima512_pred, priv_arima512_conf_int, "Profits from Private Room Listings")
# Shared-room series: plot, stationarity tests, correlograms, then first-difference
plt.figure(figsize=(15,6))
shared_rm['profit'].plot()
plt.show()
adfTest(shared_rm['profit'])
globVSloc(shared_rm['profit'])
corr(shared_rm['profit'])
shared_rm = firstlvldiff(shared_rm)
adfTest(shared_rm['frst_lvl_dff'])
globVSloc(shared_rm['frst_lvl_dff'])
corr(shared_rm['frst_lvl_dff'])
# Grid-search candidate ARIMA orders for the shared-room series
p = [1,2,6] # AR = PACF
d = [1] # difference
q = [1,6] # MA = ACF
pdq = list(itertools.product(p, d, q))
print("PDQ are", pdq)
aic_list = []
bic_list = []
hqic_list = []
pdq_models = []
from statsmodels.tsa.arima_model import ARIMA
for param in pdq:
    try:
        # BUGFIX: this section evaluates the shared-room series; the original
        # copy-pasted calendar_day['frst_lvl_dff'] here
        mod = ARIMA(shared_rm['frst_lvl_dff'], order=param)
        results = mod.fit()
        # AIC - Akaike, BIC - Schwarz criterion, HQIC - Hannan-Quinn
        print('ARIMA{}\tAIC:{:,}\t BIC:{:,}\t HQIC:{:,}'.format(param, round(results.aic, 2), round(results.bic, 2), round(results.hqic, 2)))
        pdq_models.append(param)
        aic_list.append(results.aic)
        bic_list.append(results.bic)
        hqic_list.append(results.hqic)
    except Exception:
        # skip non-converging orders without swallowing KeyboardInterrupt
        continue
print('ARIMA model with lowest AIC', pdq_models[aic_list.index(min(aic_list))])
print('ARIMA model with lowest BIC', pdq_models[bic_list.index(min(bic_list))])
print('ARIMA model with lowest HQIC', pdq_models[hqic_list.index(min(hqic_list))])
# Fit ARIMA(2,1,1) on the shared-room daily mean profits
mod211 = ARIMA(shared_rm['profit'], order=(2, 1, 1))
shrd_arima211 = mod211.fit()
# Make Predictions until 2022 February (levels scale)
shrd_arima211_pred = shrd_arima211.predict("2020-12-27", "2022-02-1", typ='levels')
# Get Confidence Intervals of the 59 unobserved forecast steps
shrd_arima211_conf_int = pd.DataFrame(shrd_arima211.forecast(59)[2], columns=['min', 'max'])
# Align the confidence-interval index with the tail of the prediction index
ind_slice = len(shrd_arima211_pred) - len(shrd_arima211_conf_int)
shrd_arima211_conf_int.index = shrd_arima211_pred.index[ind_slice:]
# Residual diagnostics, in-sample RMSE, and observed-vs-predicted plot
check_resid(shrd_arima211)
rmse = sqrt(mean_squared_error(shared_rm['profit'][8:350], shrd_arima211_pred[:342]))
print('Test RMSE: {:,.3f}'.format(rmse))
show_plot(shared_rm, shrd_arima211_pred, shrd_arima211_conf_int, "Profits from Shared Room Listings")
# Hotel-room series: plot, stationarity tests, correlograms, then first-difference
plt.figure(figsize=(15,6))
hotel_rm['profit'].plot()
plt.show()
adfTest(hotel_rm['profit'])
globVSloc(hotel_rm['profit'])
corr(hotel_rm['profit'])
hotel_rm = firstlvldiff(hotel_rm)
adfTest(hotel_rm['frst_lvl_dff'])
globVSloc(hotel_rm['frst_lvl_dff'])
corr(hotel_rm['frst_lvl_dff'])
# Grid-search candidate ARIMA orders for the hotel-room series
p = [1,2,3,4] # AR = PACF
d = [1] # difference
q = [1,2] # MA = ACF
pdq = list(itertools.product(p, d, q))
print("PDQ are", pdq)
aic_list = []
bic_list = []
hqic_list = []
pdq_models = []
from statsmodels.tsa.arima_model import ARIMA
for param in pdq:
    try:
        # BUGFIX: this section evaluates the hotel-room series; the original
        # copy-pasted calendar_day['frst_lvl_dff'] here
        mod = ARIMA(hotel_rm['frst_lvl_dff'], order=param)
        results = mod.fit()
        # AIC - Akaike, BIC - Schwarz criterion, HQIC - Hannan-Quinn
        print('ARIMA{}\tAIC:{:,}\t BIC:{:,}\t HQIC:{:,}'.format(param, round(results.aic, 2), round(results.bic, 2), round(results.hqic, 2)))
        pdq_models.append(param)
        aic_list.append(results.aic)
        bic_list.append(results.bic)
        hqic_list.append(results.hqic)
    except Exception:
        # skip non-converging orders without swallowing KeyboardInterrupt
        continue
print('ARIMA model with lowest AIC', pdq_models[aic_list.index(min(aic_list))])
print('ARIMA model with lowest BIC', pdq_models[bic_list.index(min(bic_list))])
print('ARIMA model with lowest HQIC', pdq_models[hqic_list.index(min(hqic_list))])
# Fit ARIMA(4,1,2) on the hotel-room daily mean profits
mod412 = ARIMA(hotel_rm['profit'], order=(4, 1, 2))
hot_arima412 = mod412.fit()
# Make Predictions until 2022 February (levels scale)
hot_arima412_pred = hot_arima412.predict("2020-12-27", "2022-02-1", typ='levels')
# Get Confidence Intervals of the 59 unobserved forecast steps
hot_arima412_conf_int = pd.DataFrame(hot_arima412.forecast(59)[2], columns=['min', 'max'])
# Align the confidence-interval index with the tail of the prediction index
ind_slice = len(hot_arima412_pred) - len(hot_arima412_conf_int)
hot_arima412_conf_int.index = hot_arima412_pred.index[ind_slice:]
# Residual diagnostics, in-sample RMSE, and observed-vs-predicted plot
check_resid(hot_arima412)
rmse = sqrt(mean_squared_error(hotel_rm['profit'][8:350], hot_arima412_pred[:342]))
print('Test RMSE: {:,.3f}'.format(rmse))
show_plot(hotel_rm, hot_arima412_pred, hot_arima412_conf_int, "Profits from Hotel Room Listings")
# Get the predicted Feb 1, 2022 profit per listing (last point of each prediction series)
apartProfit = apart_arima211_pred[-1]
privProfit = priv_arima512_pred[-1]
sharedProfit = shrd_arima211_pred[-1]
hotelProfit = hot_arima412_pred[-1]
print('Predicted Profit per Room Type in February 1, 2022: \n')
print('Apartment/Entire Home Listing: ${0:.2f}'.format(apartProfit))
print('Private Room Listing: ${0:.2f}'.format(privProfit))
print('Shared Room Listing: ${0:.2f}'.format(sharedProfit))
print('Hotel Room Listing: ${0:.2f}'.format(hotelProfit))
Suppose that there is an Airbnb outbound process that offers users promos and good deals for a lodging place in Hawaii County, Hawaii. As part of the Airbnb team, we try to find the maximum profit from an outbound process per customer service representative, given the following constraints:
There is a limited number of outgoing calls per room-type listing that a representative can process during their 8-hour shift:
For listings that offers Apartment/Entire Home for $55.76 : 20 mins
For listings that offers Private Room for $34.11 : 18 mins
For listings that offers Shared Room for $6.69 : 13 mins
For listings that offers Hotel Room for $37.75 : 15 mins
Given that they should process a minimum of 1 listing per room type.
!pip install pulp
from pulp import *
#Elementary features:
lp = LpProblem("Outbound Profit", LpMaximize)
#Define variables
x1 = LpVariable(name="Apartment", lowBound=1, cat="Integer")
x2 = LpVariable(name="Private Room", lowBound=1, cat="Integer")
x3 = LpVariable(name="Shared Room", lowBound=1, cat="Integer")
x4 = LpVariable(name="Hotel Room", lowBound=1, cat="Integer")
#Add the objective function
lp += (apartProfit * x1 )+ (privProfit * x2) + (sharedProfit * x3) + (hotelProfit * x4)
#print(lp.objective)
# Add the constraints
lp += (20 * x1 + 18 * x2 + 13 * x3 + 15 * x4 <= 480, "shift time")
#print(lp.constraints)
# Solve the LP
status = lp.solve(PULP_CBC_CMD(msg=0))
#print("Status:", status) #1:optimal, 2:not solved, 3:infeasible, 4:unbounded, 5:undef
print('Optimal Solution\n')
#Print solution
for var in lp.variables():
print("Outbound process for:", var, "should be", int(value(var)))
print("\nPossible Max Profit from a single CSR is ${0:.2f} for an outbound process of listings in Hawaii, Hawaii".format(value(lp.objective)))